import pandas as pd
import numpy as np
from datetime import datetime
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
# import plotly.plotly as py
# import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
from sklearn.preprocessing import LabelEncoder, StandardScaler
#Settings
%matplotlib inline
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Read the semicolon-delimited raw file.
raw = pd.read_csv('../data/raw/bank-full.csv', sep=";")
raw.head(3)
# Work on a copy of the raw frame. reset_index() materialises the row labels
# as an 'index' column, which later pivot tables use as the counted value.
df = raw.copy().reset_index()
df.info()
df.describe()
# Percent-normalised histogram of the age column.
age_histogram_distribution = px.histogram(df, x="age", nbins=10, histnorm='percent')
age_histogram_distribution.show()
# Left-closed age bands: [10, 30), [30, 40), [40, 50), [50, 60), [60, 96).
age_bins = [10, 30, 40, 50, 60, 96]
df['age_group'] = pd.cut(df['age'], bins=age_bins, right=False, include_lowest=True)
df.head(3)
df['age_group'].value_counts()
# Sanity check: an age outside the bins becomes NaN, and value_counts() drops
# NaN, so this is True only when every row landed in a band.
df['age_group'].value_counts().sum() == len(df)
# One encoder instance, re-fitted for each column in turn.
labelencoder = LabelEncoder()
non_numeric_features_df = df.select_dtypes(include=['object', 'category'])
object_features = df.select_dtypes(include='object')
# Add a '<column>_label_encoded' integer column for every object-dtype column.
for feature in object_features:
    feature_name_label_encode = f'{feature}_label_encoded'
    df[feature_name_label_encode] = labelencoder.fit_transform(df[feature].astype(str))
# Compare encoded and raw value counts for the last column processed.
df[feature_name_label_encode].value_counts()
df[feature].value_counts()
# Overwrite month_label_encoded with the calendar month number (jan=1 ... dec=12).
month_abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_to_number = {abbr: number for number, abbr in enumerate(month_abbreviations, start=1)}
df['month_label_encoded'] = df['month'].map(month_to_number)
# Scatter every numeric feature against the encoded target, five features per row.
# NOTE(review): seaborn's pairplot creates its own figure, so `fig` appears to
# be unused — confirm before removing it.
fig = plt.figure(figsize=(12,10))
sns.set()
# numeric_features_df was used below without ever being defined (NameError).
# Build it from the numeric columns, which include the *_label_encoded ones.
numeric_features_df = df.select_dtypes(include='number')
for i in range(0, len(numeric_features_df.columns), 5):
    ax = sns.pairplot(data=numeric_features_df,
                      x_vars=numeric_features_df.columns[i:i+5],
                      y_vars=['y_label_encoded'])
plt.tight_layout()
plt.show();
# Coarse percent-normalised histograms of balance and duration.
avg_yearly_balance_histogram_distribution = px.histogram(
    df, x="balance", nbins=5, histnorm='percent')
avg_yearly_balance_histogram_distribution.show()
duration_histogram_distribution = px.histogram(
    df, x="duration", nbins=5, histnorm='percent')
duration_histogram_distribution.show()
# Box plot to inspect the long right tail of duration.
px.box(df, y="duration")
# Left-closed duration bands: [0, 60), [60, 180), [180, 300), [300, 4919).
# The top edge is hard-coded, presumably to max(duration) + 1 — TODO confirm.
duration_bins = [0, 60, 180, 300, 4919]
df['duration_category'] = pd.cut(df['duration'], bins=duration_bins, right=False, include_lowest=True)
df['duration_category'].value_counts()
# True only when no row fell outside the bins (unbinned rows become NaN).
df['duration_category'].value_counts().sum() == len(df)
# Distribution of pdays.
px.box(df, y="pdays")
px.histogram(df, x="pdays", nbins=5, histnorm='percent')
# Left-closed bands: [-1, 0) isolates the negative values, then [0, 180), [180, 872).
pdays_bins = [-1, 0, 180, 872]
df['pdays_category'] = pd.cut(df['pdays'], bins=pdays_bins, right=False, include_lowest=True)
# True only when every row was assigned a band.
df['pdays_category'].value_counts().sum() == len(df)
# Number of rows with a negative pdays value.
len(df[df['pdays'] < 0])
# Box plot of balance, used to pick the band edges.
px.box(df, y="balance")
# Left-closed bands: [-8019, 0) negative, [0, 500), [500, 3000), [3000, 102128).
# The outer edges are hard-coded, presumably to the observed min and max + 1 — TODO confirm.
balance_bins = [-8019, 0, 500, 3000, 102128]
df['balance_category'] = pd.cut(df['balance'], bins=balance_bins, right=False, include_lowest=True)
df['balance_category'].value_counts()
# True only when every row was assigned a band.
df['balance_category'].value_counts().sum() == len(df)
df.head()
# 1 when the day of month is one of 16..31, otherwise 0.
df['is_second_half_of_month'] = df['day'].isin(range(16, 32)).astype(int)
df['is_second_half_of_month'].value_counts()
# Distribution of the campaign column.
px.histogram(df, x="campaign", histnorm='percent')
df['campaign'].value_counts()
# Left-closed bands: [1, 6), [6, 10), [10, 64).
current_engagement_bins = [1, 6, 10, 64]
df['current_engagement_category'] = pd.cut(
    df['campaign'], bins=current_engagement_bins, right=False, include_lowest=True)
df['current_engagement_category'].value_counts()
# True only when every row was assigned a band.
df['current_engagement_category'].value_counts().sum() == len(df)
# Distribution of the previous column.
px.histogram(df, x="previous", histnorm='percent')
df['previous'].value_counts()
# Left-closed bands:
#   [0, 1)    no prior engagement
#   [1, 6)    low frequency
#   [6, 10)   mid
#   [10, 276) high
prev_engagement_bins = [0, 1, 6, 10, 276]
df['prev_engagement_category'] = pd.cut(
    df['previous'], bins=prev_engagement_bins, right=False, include_lowest=True)
df['prev_engagement_category'].value_counts()
# True only when every row was assigned a band.
df['prev_engagement_category'].value_counts().sum() == len(df)
# Row counts per engagement band, split by poutcome and y.
df.pivot_table(index='prev_engagement_category',
               columns=['poutcome', 'y'],
               values='index',
               aggfunc=len)
# total_duration is the product of the campaign and duration columns.
df['total_duration'] = df['campaign'].mul(df['duration'])
px.histogram(df, x="total_duration", histnorm='percent')
px.box(df, y="total_duration")
# Left-closed bands: [0, 180), [180, 600), [600, 60171).
total_duration_bins = [0, 180, 600, 60171]
df['total_duration_category'] = pd.cut(
    df['total_duration'], bins=total_duration_bins, right=False, include_lowest=True)
# True only when every row was assigned a band.
df['total_duration_category'].value_counts().sum() == len(df)
# Row counts per total-duration band, split by poutcome and y.
df.pivot_table(index='total_duration_category',
               columns=['poutcome', 'y'],
               values='index',
               aggfunc=len)
# 1 where pdays equals -1, otherwise 0.
df['new_customers'] = df['pdays'].eq(-1).astype(int)
df['new_customers'].value_counts()
# Final look at the engineered frame: preview, null counts per column, and shape.
df.head(2)
df.isnull().sum()
df.shape
# Persist the pre-processed frame so later steps can restart from this point.
with open('../data/interim/df.pkl', 'wb') as pkl_out:
    pickle.dump(df, pkl_out)
# Checkpoint: the pre-processed DataFrame can be reloaded from the pickle file from this point on.
# Restore the pre-processed frame from the interim pickle.
# NOTE: only unpickle files produced by this pipeline; pickle is unsafe on untrusted input.
with open('../data/interim/df.pkl', 'rb') as pkl_in:
    df = pickle.load(pkl_in)
# Since the observations are ordered by date (from May 2008 to November 2010), the row order can be used to impute the year.
# Last row recorded for 31 May.
is_may_31 = (df['day'] == 31) & (df['month_label_encoded'] == 5)
last_may_31 = df.loc[is_may_31].tail(1)
last_may_31
# Show that row and the one after it. The index label is used as a position,
# which assumes the default 0..n-1 index — TODO confirm.
last_may_31_label = last_may_31.index.values[0]
df.iloc[last_may_31_label:last_may_31_label + 2, :]
# Hard-coded positions from an earlier inspection.
df.iloc[28904:28906, :]
# Find, for each month, the index label of the last row dated on day 25-31
# within the first 1000 rows.
#
# unique_months and sample are now defined before their first use; the
# original order raised NameError when run top to bottom.
unique_months = df['month_label_encoded'].unique().tolist()
sample = df.head(1000)
sample.head(2)
sorted(unique_months)
sample['month_label_encoded'].unique()

end_of_the_month_index = []
# Loop-invariant mask: rows in the last week of any month.
late_month_days = sample['day'].isin(range(25, 32))
for month in unique_months:
    month_end_rows = sample.loc[late_month_days & (sample['month_label_encoded'] == month)]
    # Months absent from the sample have no rows; skip them instead of
    # raising IndexError on an empty index.
    if month_end_rows.empty:
        continue
    end_of_the_month_index.append(month_end_rows.index[-1])
print(end_of_the_month_index)
# Last sample row in May dated on day 14..31.
day = np.arange(14, 32)
sample.loc[sample['day'].isin(day) & sample['month_label_encoded'].eq(5)].tail(1)
df.head(1)
# Number of distinct bands actually present in each category column.
df['balance_category'].nunique()
df['duration_category'].nunique()
# Difference between the campaign and previous contact counts.
# NOTE(review): the UCI data dictionary describes `campaign` as already counting
# only current-campaign contacts, so this difference can be negative — confirm
# the subtraction is intended.
df['num_contacts_current_campaign'] = df['campaign'].sub(df['previous'])
# Total of the campaign column for each (month, day) pair.
contacts_agg_time = (
    df.groupby(['month_label_encoded', 'day'])['campaign']
    .sum()
    .reset_index(name="num_of_contacts")
)
# Build a plotting date from day and month. All rows are placed in the single
# placeholder year 2010, so the same calendar day from different years is merged.
day_text = contacts_agg_time['day'].astype(str)
month_text = contacts_agg_time['month_label_encoded'].astype(str)
contacts_agg_time['date'] = pd.to_datetime(
    day_text + '-' + month_text + '-' + '2010', dayfirst=True)
contacts_agg_time.head()
# Line chart of contacts over the placeholder calendar year.
contacts_agg_time_fig = px.line(contacts_agg_time, x="date", y="num_of_contacts")
contacts_agg_time_fig.show()
# Total of the campaign column for each day of the month, across all months.
contacts_agg_day = (
    df.groupby('day')['campaign']
    .sum()
    .reset_index(name="num_of_contacts_per_day")
)
contacts_agg_day.head()
# Line chart of contacts by day of month.
contacts_agg_day_fig = px.line(contacts_agg_day, x="day", y="num_of_contacts_per_day")
contacts_agg_day_fig.show()